## Installing package into 'C:/Users/CYTech Student/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'webshot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\CYTech Student\AppData\Local\Temp\RtmpyCrgXF\downloaded_packages
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
library(scatterplot3d)
data(iris)

colors <- c("gray", "orange", "lightblue")[as.numeric(iris$Species)]

scatterplot3d(
  x = iris$Sepal.Length,   
  y = iris$Sepal.Width,    
  z = iris$Petal.Length,  
  
  color = colors, 
  
  pch = 16,                
  xlab = "Sepal.Length",    
  ylab = "Sepal.Width",     
  zlab = "Petal.Length",    
  angle = 45               
)

# Load necessary libraries
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
data(iris)

#Interactive 3D plot
fig <- plot_ly(
  data = iris,
  x = ~Sepal.Length,
  y = ~Petal.Length,
  z = ~Sepal.Width,
  color = ~Species,  # Color points by species
  colors = c("gray", "orange", "lightblue")
) %>%
  add_markers() %>%
  layout(
    scene = list(
      xaxis = list(title = 'Sepal Length'),
      yaxis = list(title = 'Petal Length'),
      zaxis = list(title = 'Sepal Width')
    ),
    title = "Interactive scatter Plot of Iris Dataset"
  )

fig
#Non standardisation version
library(ggplot2)

data(iris)

centered_data <- scale(iris[, 1:4], center = TRUE, scale = FALSE)
pca_res <- prcomp(centered_data)

df <- as.data.frame(pca_res$x)
df$Species <- iris$Species

# Plot PCA without standardization
p1 <- ggplot(df, aes(x = PC1, y = PC2, color = Species, shape = Species)) +
  geom_point(size = 3) +
  labs(title = "", 
       x = "PC1", 
       y = "PC2") +
       
  theme_minimal() +
  
  scale_color_manual(values = c("blue", "orange", "darkgray")) +
  
  scale_shape_manual(values = c(16, 17, 15)) +  
  
  geom_hline(yintercept = 0, linetype = "dashed", size = 1.1) +
  
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.1) +
  
  theme(legend.title = element_text(size = 12), legend.position = "right",
        panel.background = element_rect(fill = "lightgrey"),  
        panel.grid.major = element_line(color = "white"),   
        panel.grid.minor = element_line(color = "white"),  
        panel.border = element_rect(color = "white", fill = NA))  # Set border to white
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p1)

library(plotly)

data(iris)

centered_data <- scale(iris[, 1:4], center = TRUE, scale = FALSE)
pca_res <- prcomp(centered_data)

df <- as.data.frame(pca_res$x)
df$Species <- iris$Species

p1_plotly <- plot_ly(df, 
                     x = ~PC1, 
                     y = ~PC2, 
                     color = ~Species, 
                     colors = c("blue", "orange", "darkgray"), 
                     
                     symbol = ~Species, 
                     
                     symbols = c("circle", "triangle-up", "square"),
                     
                     text = ~paste("Species:", Species, "PC1:", round(PC1, 2), "PC2:", round(PC2, 2)),
                     
                     mode = "markers") %>%
  
  layout(title = "No Standardization",
         
         xaxis = list(title = "PC1"),
         
         yaxis = list(title = "PC2"),
         
         plot_bgcolor = "lightgrey",
         
         paper_bgcolor = "white",
         
         shapes = list(
           list(type = "line", x0 = 0, x1 = 0, y0 = -3, y1 = 3, line = list(color = "black", dash = 'dash', width = 1.2)),
           
           list(type = "line", x0 = -3, x1 = 3, y0 = 0, y1 = 0, line = list(color = "black", dash = 'dash', width = 1.2))  
         ))

# Render the plot
p1_plotly
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
#standardised version
standardized_data <- scale(iris[, 1:4])
pca_res_std <- prcomp(standardized_data)

#dataframe
df_std <- as.data.frame(pca_res_std$x)
df_std$Species <- iris$Species

p2 <- ggplot(df_std, aes(x = PC1, y = PC2, color = Species, shape = Species)) +
  geom_point(size = 3) +
  labs(title = "", 
       x = "PC1", 
       y = "PC2") +
       
  theme_minimal() +
  
  scale_color_manual(values = c("blue", "orange", "darkgray")) +
  
  scale_shape_manual(values = c(16, 17, 15)) +
  
  geom_hline(yintercept = 0, linetype = "dashed", size = 1.1) +
  
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.1) +
  
  theme(legend.title = element_text(size = 12), legend.position = "right",
        panel.background = element_rect(fill = "lightgrey"),  
        panel.grid.major = element_line(color = "white"),   
        panel.grid.minor = element_line(color = "white"),  
        panel.border = element_rect(color = "white", fill = NA))  
                       
  
print(p2)

library(plotly)

data(iris)

standardized_data <- scale(iris[, 1:4])
pca_res_standardized <- prcomp(standardized_data)

df_standardized <- as.data.frame(pca_res_standardized$x)
df_standardized$Species <- iris$Species

p2_plotly <- plot_ly(df_standardized, 
                     x = ~PC1, 
                     y = ~PC2, 
                     color = ~Species, 
                     colors = c("blue", "orange", "darkgray"), 
                     symbol = ~Species, 
                     symbols = c("circle", "triangle-up", "square"),
                     text = ~paste("Species:", Species, "<br>PC1:", round(PC1, 2), "<br>PC2:", round(PC2, 2)),
                     mode = "markers") %>%
  layout(title = "PCA Plot (Standardized)",
         xaxis = list(title = "PC1"),
         yaxis = list(title = "PC2"),
         plot_bgcolor = "lightgrey",
         paper_bgcolor = "white",
         shapes = list(
           list(type = "line", x0 = 0, x1 = 0, y0 = -3, y1 = 3, line = list(color = "black", dash = 'dash', width = 1.2)),
           list(type = "line", x0 = -3, x1 = 3, y0 = 0, y1 = 0, line = list(color = "black", dash = 'dash', width = 1.2))
         ))

p2_plotly
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: package 'ggplot2' is in use and will not be installed
library(ggplot2)

data(iris)

cor_matrix <- cor(iris[, 1:4])

cor_df <- as.data.frame(as.table(cor_matrix))

colnames(cor_df) <- c("Var1", "Var2", "value")

ggplot(cor_df, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(value, 2)), size = 4, color = "black") +
  scale_fill_gradient2(low = "red", high = "turquoise", mid = "gold", 
                       limit = c(-1, 1), name = "Correlation") +
  labs(title = "Correlogram of Iris Dataset", x = "", y = "") +
  theme_minimal() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.background = element_rect(fill = "lightgrey"),
        axis.text.x = element_text(angle = 45, hjust = 1))

## Warning: package 'FactoMineR' was built under R version 4.3.3
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
iris_stand <- scale(iris[, -5])

#PCA on the standardized dataset
pca_res <- PCA(iris_stand, graph = FALSE)

fviz_pca_var(pca_res, col.var = "contrib",  # Color based on contribution
              title = "Circle of Correlations - Iris Dataset") +
  theme_minimal() +
  scale_color_gradient2(low = "blue", mid = "yellow", high = "red", 
                        midpoint = median(pca_res$var$contrib))  

#Note: made a gradient to see how much contribution each variable has
pca_result <- PCA(iris[, -5], graph = FALSE)

fviz_pca_biplot(pca_result, 
                 geom.ind = "point",
                 geom.var = "arrow", 
                 col.ind = iris$Species,  
                 addEllipses = TRUE,  
                 ellipse.type = "confidence",  
                 palette = c("blue", "orange", "gray"), 
                 title = "PCA - Biplot") +
  labs(x = "Dim1 ", y = "Dim2 ") +
  theme_minimal()

pca_result <- PCA(iris[, -5], graph = FALSE)

fviz_eig(pca_result, addlabels = TRUE) + 
  labs(title = "Scree Plot with curve", 
       x = "Principal Components", 
       y = "Variance (%)") +

  geom_line(aes(x = 1:length(pca_result$eig[,1]), 
                y = cumsum(pca_result$eig[,2])), 
            color = "red", size = 1) +

  geom_point(aes(x = 1:length(pca_result$eig[,1]), 
                 y = cumsum(pca_result$eig[,2])), 
             color = "red", size = 3)

library(FactoMineR)

pca_result <- PCA(iris[, -5], graph = FALSE)

eigenvalues <- pca_result$eig

eigen_table <- data.frame(
  Component = 1:nrow(eigenvalues),
  `Initial Eigenvalues` = round(eigenvalues[, 1], 3),
  `Variance` = round(eigenvalues[, 2], 3),
  `Cumulated Variance` = round(eigenvalues[, 3], 3)
)

print(eigen_table)
##        Component Initial.Eigenvalues Variance Cumulated.Variance
## comp 1         1               2.918   72.962             72.962
## comp 2         2               0.914   22.851             95.813
## comp 3         3               0.147    3.669             99.482
## comp 4         4               0.021    0.518            100.000

Ex9.)

data <- iris[, -5]

pca_result <- prcomp(data, scale = TRUE)

loadings <- pca_result$rotation %*% diag(pca_result$sdev)

print(loadings)
##                    [,1]        [,2]        [,3]        [,4]
## Sepal.Length  0.8901688 -0.36082989  0.27565767  0.03760602
## Sepal.Width  -0.4601427 -0.88271627 -0.09361987 -0.01777631
## Petal.Length  0.9915552 -0.02341519 -0.05444699 -0.11534978
## Petal.Width   0.9649790 -0.06399985 -0.24298265  0.07535950
strongest_cp1 <- loadings[which.max(abs(loadings[, 1])), ]
print(strongest_cp1)
## [1]  0.99155518 -0.02341519 -0.05444699 -0.11534978

Ex10.)

data <- iris[, -5]

pca_result <- prcomp(data, scale = TRUE)

loadings <- pca_result$rotation %*% diag(pca_result$sdev)

loadings_df <- as.data.frame(loadings)

#Identify the variable with the strongest saturation for each principal component
strongest_saturations <- apply(loadings_df, 2, function(x) rownames(loadings_df)[which.max(abs(x))])

results_table <- data.frame(
  Principal_Component = names(strongest_saturations),
  Strongest_Variable = strongest_saturations
)

print(results_table)
##    Principal_Component Strongest_Variable
## V1                  V1       Petal.Length
## V2                  V2        Sepal.Width
## V3                  V3       Sepal.Length
## V4                  V4       Petal.Length

Ex11.) Saturation and Arrow Length

Length of Arrows: Indicates strength of correlation. Long Arrows: Represent strong saturation with principal components. Short Arrows: Indicate weak correlation.

Orientation of Arrows

Direction: Shows relationship with principal components. Positive Contributions: Arrows pointing in the same direction as CP axis. Negative Contributions: Arrows pointing in the opposite direction.

Interpretation of Relationships

Understanding Contributions: Strong saturation indicates significant role in variance explanation. Inter connectedness: Close arrows suggest positive correlation between variables.

In other words… summary Arrow length = saturation strength. Arrow direction = positive/negative contribution to principal components.

Ex12.)

#based on cos^2
fviz_pca_var(pca_result, col.var = "cos2",
             gradient.cols = c("turquoise", "yellow", "red"),
             repel = TRUE) +
  labs(title = "Circle of Correlations (Colored by cos²)")

#Circle of correlations with color based on contributions
fviz_pca_var(pca_result, col.var = "contrib",
             gradient.cols = c("turquoise", "yellow", "red"),
             repel = TRUE) +
  labs(title = "Circle of Correlations")

The cos² metric indicates how well each variable is represented by the principal components, with higher values showing a closer alignment to the component axes, meaning the variable’s variance is well captured. Contribution, on the other hand, measures how much each variable shapes a particular component, indicating its influence on the component’s direction. While cos² focuses on representation quality, contribution emphasizes the impact of each variable on defining the component.

Ex13

fviz_pca_ind(pca_result, col.ind = "cos2",
             gradient.cols = c("turquoise", "yellow", "red"),
             repel = TRUE) +
  labs(title = "Individuals on Factorial Plane (Colored by cos²)")

#Extract contributions of individuals to first two axes
contrib_ind <- pca_res$ind$contrib[, 1:2]

contrib_table <- as.data.frame(contrib_ind)
colnames(contrib_table) <- c("Contribution to PC1", "Contribution to PC2")

print(contrib_table)
##     Contribution to PC1 Contribution to PC2
## 1          1.171580e+00        1.680655e-01
## 2          9.891845e-01        3.314667e-01
## 3          1.276816e+00        8.526419e-02
## 4          1.207737e+00        2.602978e-01
## 5          1.304631e+00        3.051656e-01
## 6          9.841236e-01        1.617488e+00
## 7          1.364464e+00        1.655648e-03
## 8          1.138852e+00        3.631904e-02
## 9          1.245057e+00        9.073044e-01
## 10         1.089896e+00        1.604423e-01
## 11         1.071990e+00        7.944959e-01
## 12         1.235998e+00        1.291703e-02
## 13         1.124214e+00        3.872730e-01
## 14         1.583742e+00        6.742993e-01
## 15         1.104326e+00        2.523484e+00
## 16         1.169007e+00        5.263227e+00
## 17         1.113231e+00        1.605415e+00
## 18         1.095913e+00        1.742924e-01
## 19         8.233861e-01        1.439834e+00
## 20         1.254385e+00        9.277913e-01
## 21         8.371047e-01        1.219237e-01
## 22         1.112651e+00        6.228825e-01
## 23         1.758208e+00        1.532253e-01
## 24         7.555391e-01        5.339181e-03
## 25         1.133062e+00        1.374045e-02
## 26         8.702431e-01        2.854745e-01
## 27         9.610474e-01        4.277260e-02
## 28         1.074235e+00        2.026822e-01
## 29         1.045682e+00        7.155516e-02
## 30         1.172158e+00        8.319405e-02
## 31         1.046228e+00        1.856695e-01
## 32         7.663165e-01        1.309347e-01
## 33         1.561980e+00        2.346322e+00
## 34         1.366864e+00        3.373797e+00
## 35         1.016960e+00        1.544702e-01
## 36         1.113454e+00        3.098384e-02
## 37         9.554283e-01        3.192156e-01
## 38         1.459063e+00        2.558709e-01
## 39         1.348437e+00        5.962905e-01
## 40         1.075358e+00        5.273048e-02
## 41         1.194214e+00        1.423092e-01
## 42         7.886749e-01        3.984922e+00
## 43         1.489595e+00        1.674178e-01
## 44         8.815162e-01        1.627170e-01
## 45         1.043236e+00        9.516004e-01
## 46         9.785494e-01        3.687667e-01
## 47         1.299059e+00        9.156243e-01
## 48         1.309586e+00        1.088123e-01
## 49         1.135386e+00        7.263971e-01
## 50         1.109448e+00        6.195362e-05
## 51         2.772937e-01        5.431777e-01
## 52         1.221757e-01        2.578810e-01
## 53         3.517859e-01        2.770315e-01
## 54         3.792875e-02        2.244953e+00
## 55         2.642103e-01        3.168336e-02
## 56         3.451041e-02        2.567277e-01
## 57         1.273045e-01        4.358417e-01
## 58         5.424787e-02        2.502829e+00
## 59         1.966769e-01        7.574657e-04
## 60         2.982306e-05        7.798382e-01
## 61         2.773852e-03        5.137759e+00
## 62         4.436317e-02        2.922062e-03
## 63         7.217543e-02        2.271443e+00
## 64         1.182730e-01        2.529427e-02
## 65         2.541344e-04        1.405670e-01
## 66         1.750530e-01        1.890135e-01
## 67         2.802269e-02        2.810869e-02
## 68         5.761099e-03        4.576183e-01
## 69         3.428372e-01        1.919466e+00
## 70         6.212765e-03        1.237589e+00
## 71         1.243050e-01        1.147073e-01
## 72         5.181878e-02        1.270244e-01
## 73         3.479405e-01        6.353522e-01
## 74         9.148759e-02        1.264573e-01
## 75         1.127824e-01        2.932841e-03
## 76         1.746000e-01        4.587544e-02
## 77         3.606456e-01        4.353241e-03
## 78         4.215101e-01        8.006110e-02
## 79         1.009559e-01        3.722954e-02
## 80         3.702260e-04        8.175402e-01
## 81         3.907804e-03        1.780169e+00
## 82         1.256419e-04        1.803499e+00
## 83         1.332666e-02        4.406326e-01
## 84         2.571921e-01        2.930298e-01
## 85         1.145943e-02        6.040177e-02
## 86         4.206739e-02        5.215066e-01
## 87         2.512321e-01        1.987812e-01
## 88         2.492254e-01        1.395036e+00
## 89         1.106171e-03        3.514231e-02
## 90         1.835633e-02        1.288873e+00
## 91         1.779098e-02        9.149687e-01
## 92         8.910690e-02        4.530537e-04
## 93         2.587009e-02        7.125528e-01
## 94         2.996446e-02        2.973877e+00
## 95         1.902395e-02        5.340992e-01
## 96         1.906636e-03        2.394565e-02
## 97         1.184513e-02        1.080660e-01
## 98         7.588915e-02        1.749455e-02
## 99         4.577829e-02        1.738304e+00
## 100        1.505583e-02        2.615693e-01
## 101        7.772113e-01        5.525952e-01
## 102        3.062511e-01        3.562384e-01
## 103        1.110891e+00        2.303758e-01
## 104        4.737675e-01        1.610328e-03
## 105        7.969220e-01        6.349274e-02
## 106        1.729841e+00        4.672746e-01
## 107        3.076971e-02        1.778417e+00
## 108        1.210949e+00        1.287011e-01
## 109        9.198318e-01        3.691671e-01
## 110        1.166489e+00        2.691581e+00
## 111        4.250988e-01        3.500332e-01
## 112        5.867354e-01        1.297048e-01
## 113        8.107097e-01        1.282016e-01
## 114        3.627186e-01        9.852693e-01
## 115        4.920299e-01        1.426679e-01
## 116        5.775450e-01        3.335462e-01
## 117        4.945719e-01        4.765889e-02
## 118        1.344772e+00        4.767541e+00
## 119        2.503732e+00        2.305993e-04
## 120        3.648238e-01        2.124641e+00
## 121        9.484988e-01        6.046122e-01
## 122        2.184791e-01        2.384417e-01
## 123        1.917969e+00        1.247945e-01
## 124        4.060326e-01        1.693175e-01
## 125        6.607271e-01        7.498200e-01
## 126        8.724563e-01        7.407599e-01
## 127        3.154291e-01        7.301395e-02
## 128        2.380997e-01        3.019893e-03
## 129        7.305574e-01        2.560398e-02
## 130        7.933721e-01        2.306056e-01
## 131        1.355462e+00        4.903441e-02
## 132        1.213568e+00        5.030886e+00
## 133        7.925683e-01        2.325227e-02
## 134        2.835535e-01        6.258264e-02
## 135        3.302937e-01        4.800952e-01
## 136        1.789303e+00        5.354394e-01
## 137        5.675483e-01        8.328428e-01
## 138        4.140263e-01        1.301544e-01
## 139        1.953747e-01        2.163569e-04
## 140        7.835243e-01        3.334311e-01
## 141        9.272946e-01        2.748673e-01
## 142        8.261745e-01        3.468260e-01
## 143        3.062511e-01        3.562384e-01
## 144        9.511464e-01        5.489182e-01
## 145        9.120198e-01        8.028580e-01
## 146        7.992200e-01        1.092179e-01
## 147        5.591717e-01        5.864482e-01
## 148        5.285732e-01        5.280510e-02
## 149        4.304832e-01        7.458799e-01
## 150        2.108071e-01        4.318091e-04

The contribution of an individual to a principal component (for instance PC1 or PC2) quantifies how much that individual influences the component’s direction. Higher contributions indicate that the individual has a stronger effect on shaping the component’s axis. This is useful in finding which individuals are most significant in defining the structure of the data along the principal components.

Ex 16

pca_scores <- pca_result$x[, 1:2]
colnames(pca_scores) <- c("PC1", "PC2")  

#K-Means Clustering
k <- 3
kmeans_result <- kmeans(pca_scores, centers = k)

pca_df <- data.frame(pca_scores, Cluster = as.factor(kmeans_result$cluster), Species = iris$Species)

centers <- as.data.frame(kmeans_result$centers)
ggplot(pca_df, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3) +
  geom_point(data = centers, aes(x = PC1, y = PC2), color = "black", size = 5, shape = 4) +  # Add centroids
  labs(title = "K-Means Clustering for the Iris Dataset on PCA",
       x = "PC1",
       y = "PC2") +
  theme_minimal()

#Calculate means of quantitative variables for each cluster
means_clusters <- aggregate(iris[, -5], by = list(Cluster = kmeans_result$cluster), FUN = mean)

#Calculate proportions of species in each cluster
species_distribution <- as.data.frame(table(kmeans_result$cluster, iris$Species))
proportions <- species_distribution %>%
  group_by(Var1) %>%
  mutate(Percentage = Freq / sum(Freq) * 100)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Ex 17

iris_data <- iris %>% select(-Species)

# Rescale the data (standardizing)
iris_rescaled <- scale(iris_data)

# Visualize the Elbow method using WSS
fviz_nbclust(
  x = iris_rescaled,     
  FUNcluster = kmeans,      
  method = "wss",         
  k.max = 10,               
  nstart = 10           
) + 
  labs(title = "Elbow Method for Optimal Clusters") +  
  xlab("Number of Clusters (k)") +                     
  ylab("Within-cluster sum of squares") +              
  theme_bw() +                                         
  theme(plot.title = element_text(hjust = 0.5))         

## Installing package into 'C:/Users/CYTech Student/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'vegan' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'vegan'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\CYTech
## Student\AppData\Local\R\win-library\4.3\00LOCK\vegan\libs\x64\vegan.dll to
## C:\Users\CYTech
## Student\AppData\Local\R\win-library\4.3\vegan\libs\x64\vegan.dll: Permission
## denied
## Warning: restored 'vegan'
## 
## The downloaded binary packages are in
##  C:\Users\CYTech Student\AppData\Local\Temp\RtmpyCrgXF\downloaded_packages
## Warning: package 'vegan' was built under R version 4.3.3
## Loading required package: permute
## Warning: package 'permute' was built under R version 4.3.3
## Loading required package: lattice
library(factoextra)
library(ggplot2)

fviz_nbclust(
  x = iris_rescaled,            
  FUNcluster = kmeans,        
  method = "silhouette",
  k.max = 10                   
) + 
  labs(title = "Silhouette Method for Optimal Clusters",   
       x = "Number of Clusters (k)",                    
       y = "Average Silhouette Width") +                 
  theme_minimal() +                                       
  theme(plot.title = element_text(hjust = 0.5))           

install.packages("NbClust")
## Installing package into 'C:/Users/CYTech Student/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'NbClust' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\CYTech Student\AppData\Local\Temp\RtmpyCrgXF\downloaded_packages
library(NbClust)
#finding optimal number of clusters using NbClust
nb_result <- NbClust(pca_scores, min.nc = 2, max.nc = 10, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 9 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 3 proposed 9 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
print(nb_result$Best.nc)
##                      KL       CH Hartigan    CCC    Scott  Marriot   TrCovW
## Number_clusters  9.0000   9.0000   3.0000 3.0000   3.0000    5.000     3.00
## Value_Index     14.0393 295.9081  63.7653 5.8556 130.2769 8642.763 17538.55
##                  TraceW Friedman   Rubin Cindex     DB Silhouette   Duda
## Number_clusters  3.0000   8.0000  9.0000 3.0000 2.0000     2.0000 2.0000
## Value_Index     56.3319   9.5436 -1.6317 0.2113 0.6485     0.6145 1.7818
##                 PseudoT2   Beale Ratkowsky    Ball PtBiserial   Frey McClain
## Number_clusters   2.0000  2.0000    3.0000  3.0000     2.0000 3.0000  2.0000
## Value_Index     -44.3151 -0.4298    0.4615 59.9592     0.7896 1.4462  0.3202
##                   Dunn Hubert SDindex Dindex    SDbw
## Number_clusters 2.0000      0  2.0000      0 10.0000
## Value_Index     0.2504      0  1.3813      0  0.1838
#Barplot to show the frequency of optimal clusters suggested by different indices
barplot(table(nb_result$Best.nc), 
        xlab = "Number of Clusters", 
        ylab = "Frequency", 
        main = "Optimal Num clusters using NbClust")

library(FactoMineR)
library(ggplot2)

Part 4 Ex1.)

data("decathlon")

str(decathlon)
## 'data.frame':    41 obs. of  13 variables:
##  $ 100m       : num  11 10.8 11 11 11.3 ...
##  $ Long.jump  : num  7.58 7.4 7.3 7.23 7.09 7.6 7.3 7.31 6.81 7.56 ...
##  $ Shot.put   : num  14.8 14.3 14.8 14.2 15.2 ...
##  $ High.jump  : num  2.07 1.86 2.04 1.92 2.1 1.98 2.01 2.13 1.95 1.86 ...
##  $ 400m       : num  49.8 49.4 48.4 48.9 50.4 ...
##  $ 110m.hurdle: num  14.7 14.1 14.1 15 15.3 ...
##  $ Discus     : num  43.8 50.7 49 40.9 46.3 ...
##  $ Pole.vault : num  5.02 4.92 4.92 5.32 4.72 4.92 4.42 4.42 4.92 4.82 ...
##  $ Javeline   : num  63.2 60.1 50.3 62.8 63.4 ...
##  $ 1500m      : num  292 302 300 280 276 ...
##  $ Rank       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Points     : int  8217 8122 8099 8067 8036 8030 8004 7995 7802 7733 ...
##  $ Competition: Factor w/ 2 levels "Decastar","OlympicG": 1 1 1 1 1 1 1 1 1 1 ...
decathlon_numeric <- decathlon[, sapply(decathlon, is.numeric)]

#Standardize
decathlon_scaled <- scale(decathlon_numeric)

pca_result <- prcomp(decathlon_scaled, center = TRUE, scale. = TRUE)

summary(pca_result)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.1815 1.3191 1.1895 1.06385 0.92841 0.77931 0.71446
## Proportion of Variance 0.3966 0.1450 0.1179 0.09431 0.07183 0.05061 0.04254
## Cumulative Proportion  0.3966 0.5416 0.6595 0.75380 0.82563 0.87624 0.91878
##                            PC8    PC9    PC10    PC11    PC12
## Standard deviation     0.64116 0.4850 0.43286 0.37545 0.00735
## Proportion of Variance 0.03426 0.0196 0.01561 0.01175 0.00000
## Cumulative Proportion  0.95303 0.9726 0.98825 1.00000 1.00000
pca_loadings <- pca_result$rotation

correlation_circle <- as.data.frame(pca_loadings)

radius <- 1

theta <- seq(0, 2 * pi, length.out = 100)
circle <- data.frame(x = radius * cos(theta), y = radius * sin(theta))

ggplot(circle, aes(x = x, y = y)) +
  geom_path(color = "lightgrey") +
  geom_segment(data = correlation_circle, aes(x = 0, y = 0, 
                                               xend = PC1, yend = PC2), 
               arrow = arrow(length = unit(0.2, "inches")), 
               color = "blue") +
  geom_text(data = correlation_circle, aes(x = PC1, y = PC2, label = rownames(correlation_circle)), 
            vjust = 1.5, color = "red") +
  labs(title = "Decathlon - Circle of Correlations PCA", 
       x = "Principal Component 1", 
       y = "Principal Component 2") +
  coord_fixed() +
  theme_minimal()

Ex2.)

data("decathlon")

str(decathlon)
## 'data.frame':    41 obs. of  13 variables:
##  $ 100m       : num  11 10.8 11 11 11.3 ...
##  $ Long.jump  : num  7.58 7.4 7.3 7.23 7.09 7.6 7.3 7.31 6.81 7.56 ...
##  $ Shot.put   : num  14.8 14.3 14.8 14.2 15.2 ...
##  $ High.jump  : num  2.07 1.86 2.04 1.92 2.1 1.98 2.01 2.13 1.95 1.86 ...
##  $ 400m       : num  49.8 49.4 48.4 48.9 50.4 ...
##  $ 110m.hurdle: num  14.7 14.1 14.1 15 15.3 ...
##  $ Discus     : num  43.8 50.7 49 40.9 46.3 ...
##  $ Pole.vault : num  5.02 4.92 4.92 5.32 4.72 4.92 4.42 4.42 4.92 4.82 ...
##  $ Javeline   : num  63.2 60.1 50.3 62.8 63.4 ...
##  $ 1500m      : num  292 302 300 280 276 ...
##  $ Rank       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Points     : int  8217 8122 8099 8067 8036 8030 8004 7995 7802 7733 ...
##  $ Competition: Factor w/ 2 levels "Decastar","OlympicG": 1 1 1 1 1 1 1 1 1 1 ...
#Remove any non numeric columns if present
decathlon_numeric <- decathlon[, sapply(decathlon, is.numeric)]

decathlon_scaled <- scale(decathlon_numeric)

pca_result <- prcomp(decathlon_scaled, center = TRUE, scale. = TRUE)

pca_scores <- as.data.frame(pca_result$x)

#Calculate quality of representation (squared cosines)
quality_representation <- rowSums(pca_scores^2)

plot_data <- pca_scores %>%
  mutate(quality = quality_representation)

#Bubble plot
ggplot(plot_data, aes(x = PC1, y = PC2)) +
  geom_point(aes(size = quality), alpha = 0.7, color = "turquoise") +
  scale_size(range = c(1, 10), name = "Quality of Representation") +  # Adjust size range as needed
  labs(title = "Individuals in PCA Factorial Plane",
       x = "PC1",
       y = "PC2") +
  theme_minimal()

decathlon_numeric <- decathlon[, sapply(decathlon, is.numeric)]
decathlon_scaled <- scale(decathlon_numeric)

pca_result <- prcomp(decathlon_scaled, center = TRUE, scale. = TRUE)

pca_summary <- summary(pca_result)

#Extract variance explained for the first five components
variance_explained <- round(pca_summary$importance[2, 1:5] * 100, 3)  #getting percentage so times 100

#Extract loadings for the first five components and round them
loadings_table <- round(as.data.frame(pca_result$rotation[, 1:5]), 3)

variance_table <- data.frame(
  Component = paste0("PC", 1:5),
  Variance_Explained = variance_explained
)
loadings <- as.data.frame(round(pca_result$rotation[, 1:5], 3))

loadings$Variable <- rownames(loadings)
loadings <- loadings[, c("Variable", "PC1", "PC2", "PC3", "PC4", "PC5")]
library(ggplot2)

data("decathlon")

str(decathlon)
## 'data.frame':    41 obs. of  13 variables:
##  $ 100m       : num  11 10.8 11 11 11.3 ...
##  $ Long.jump  : num  7.58 7.4 7.3 7.23 7.09 7.6 7.3 7.31 6.81 7.56 ...
##  $ Shot.put   : num  14.8 14.3 14.8 14.2 15.2 ...
##  $ High.jump  : num  2.07 1.86 2.04 1.92 2.1 1.98 2.01 2.13 1.95 1.86 ...
##  $ 400m       : num  49.8 49.4 48.4 48.9 50.4 ...
##  $ 110m.hurdle: num  14.7 14.1 14.1 15 15.3 ...
##  $ Discus     : num  43.8 50.7 49 40.9 46.3 ...
##  $ Pole.vault : num  5.02 4.92 4.92 5.32 4.72 4.92 4.42 4.42 4.92 4.82 ...
##  $ Javeline   : num  63.2 60.1 50.3 62.8 63.4 ...
##  $ 1500m      : num  292 302 300 280 276 ...
##  $ Rank       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Points     : int  8217 8122 8099 8067 8036 8030 8004 7995 7802 7733 ...
##  $ Competition: Factor w/ 2 levels "Decastar","OlympicG": 1 1 1 1 1 1 1 1 1 1 ...
decathlon_numeric <- decathlon[, sapply(decathlon, is.numeric)]

decathlon_scaled <- scale(decathlon_numeric)

pca_result <- prcomp(decathlon_scaled, center = TRUE, scale. = TRUE)

pca_summary <- summary(pca_result)

pca_loadings <- as.data.frame(pca_result$rotation)

variance_explained <- pca_summary$importance[2, 1:5] * 100  

component_names <- c("Athleticism", "Upper Strength", "Jumping", "Endurance", "Technique")
justifications <- c(
  "There are High loadings across the running events for instance in 100m, 400m or 1500m, meaning that their is mix of both speed and endurance.",
  
  "There are also high loadings on throwing events like Shot Put and Javelin, which show strength.",
  
  "Next, there is high loadings on Long Jump and High Jump which are jumping events.",
  
  "There is high loading specifically on 1500m which demonstrate a athelete stamina.",
  "Lastly, there are loadings on very mechanical and technique based events like Pole Vault, where the athlete skill and technique are important."
)

result_table <- data.frame(
  Component = paste0("PC", 1:5),
  Name = component_names,
  Variance_Explained = round(variance_explained, 2),
  Justification = justifications
)

print(result_table)
##     Component           Name Variance_Explained
## PC1       PC1    Athleticism              39.66
## PC2       PC2 Upper Strength              14.50
## PC3       PC3        Jumping              11.79
## PC4       PC4      Endurance               9.43
## PC5       PC5      Technique               7.18
##                                                                                                                                      Justification
## PC1  There are High loadings across the running events for instance in 100m, 400m or 1500m, meaning that their is mix of both speed and endurance.
## PC2                                                There are also high loadings on throwing events like Shot Put and Javelin, which show strength.
## PC3                                                              Next, there is high loadings on Long Jump and High Jump which are jumping events.
## PC4                                                              There is high loading specifically on 1500m which demonstrate a athelete stamina.
## PC5 Lastly, there are loadings on very mechanical and technique based events like Pole Vault, where the athlete skill and technique are important.